Module 06
Statistical learning refers to a vast set of tools for understanding data. These tools can be classified as supervised or unsupervised. Broadly speaking, supervised statistical learning involves building a statistical model for predicting, or estimating, an output based on one or more inputs.
- Introduction, ISLR
\[ Y = f(X_1, X_2, \ldots, X_p) + \epsilon \]
\(f\) is a fixed but unknown function of \(X_1, X_2, \ldots, X_p\), and \(\epsilon\) is a random error term, which is independent of \(X\) and has a mean of zero.
We want to predict a value Y based on the values of X assuming a functional relationship between Y and X.
# Sales against each advertising channel, each panel with a least-squares line
# (no confidence band). Assumes `advertising` was loaded in an earlier chunk.
p1 <- advertising |>
  ggplot(aes(x = TV, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(method = "lm", color = "blue", se = FALSE, linewidth = 2) +
  labs(x = "TV", y = "Sales") +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()
p2 <- advertising |>
  ggplot(aes(x = Radio, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(method = "lm", color = "blue", se = FALSE, linewidth = 2) +
  labs(x = "Radio", y = "Sales") +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()
p3 <- advertising |>
  ggplot(aes(x = Newspaper, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(method = "lm", color = "blue", se = FALSE, linewidth = 2) +
  labs(x = "Newspaper", y = "Sales") +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()
# Combine the three panels side by side (patchwork).
p1 + p2 + p3

# Income1.csv: keep only columns 2-3 (Education, Income), both doubles.
income_1 <- read_csv("datasets/Income1.csv", col_names = TRUE, col_select = c(2:3), col_types = "dd")
# income_1
# Education vs Income with two smoothers for comparison:
# a straight line (blue) and a flexible loess curve (orange).
income_1 |>
  ggplot(aes(x = Education, y = Income)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  geom_smooth(method = "loess", color = "orange", se = FALSE) +
  labs(x = "Years of Education", y = "Income") +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_x_continuous(limits = c(10, 22), breaks = seq(10, 22, 2)) +
  theme_light()

\[ \textbf{Income} = \beta_0 + \beta_1 \times \textbf{Education} \\ \, \\ \textbf{Income} = \frac{a}{1 + \exp(-b(\textbf{Education} - c))} + d \]
How do we choose our model?
# Four-parameter sigmoid (logistic) income curve.
# Rises from a floor of d to a ceiling of d + a, centred at x = c,
# with steepness controlled by b. Vectorized in x.
income_function <- function(x, a, b, c, d) {
  sigmoid <- 1 / (1 + exp(-b * (x - c)))
  a * sigmoid + d
}
# Starting values for the nonlinear least-squares fit, chosen by eye
# from the scatter plot of the income data.
a <- 60
b <- 0.5
c <- 16
d <- 20
# Fit the four-parameter sigmoid to the income data by nonlinear least squares.
fit <- nls(Income ~ income_function(Education, a, b, c, d), data = income_1,
start = list(a = a, b = b, c = c, d = d))
# Estimated coefficients (a, b, c, d).
coef_fit <- coef(fit)
# coef_fit
# Attach fitted values and residuals (.fitted, .resid) via broom::augment.
income_1_augment <- augment(fit, income_1)
# Top panel: data, the fitted sigmoid (blue), and residual segments
# connecting each observation to its fitted value.
p1 <- income_1_augment |>
  ggplot(aes(x = Education, y = Income)) +
  geom_segment(aes(xend = Education, yend = .fitted), color = "black", linetype = "solid", linewidth = 0.5) +
  geom_point(color = "red", shape = "circle") +
  geom_function(fun = function(x) income_function(x,
                                                  a = coef_fit["a"],
                                                  b = coef_fit["b"],
                                                  c = coef_fit["c"],
                                                  d = coef_fit["d"]),
                color = "blue", linewidth = 1) +
  labs(x = "Years of Education", y = "Income") +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_x_continuous(limits = c(10, 22), breaks = seq(10, 22, 2)) +
  theme_light()
# Bottom panel: residuals against Education.
p2 <- income_1_augment |>
  ggplot(aes(x = Education, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Years of Education", y = "Residuals") +
  scale_y_continuous(limits = c(-10, 10), breaks = seq(-10, 10, 5)) +
  scale_x_continuous(limits = c(10, 22), breaks = seq(10, 22, 2)) +
  theme_light()
# Stack the panels 3:1 with collected axes (patchwork).
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")

# fit the income data to a function that is sigmoid shape in Education and linear in Seniority
# Two-predictor income model: sigmoid in x (Education) plus a
# linear term in y (Seniority). Vectorized in both x and y.
income_function_2 <- function(x, y, a, b, c, d, e) {
  education_term <- a / (1 + exp(-b * (x - c)))
  seniority_term <- e * y
  d + education_term + seniority_term
}
# Starting values for the two-predictor fit (sigmoid in Education plus a
# linear Seniority term).
a <- 60
b <- 0.5
c <- 16
d <- 20
e <- 0.5
# NOTE(review): income_2 (with Education, Seniority, Income columns) must be
# loaded in an earlier chunk; it is not read anywhere in this one — confirm.
fit_2 <- nls(Income ~ income_function_2(Education, Seniority, a, b, c, d, e), data = income_2,
start = list(a = a, b = b, c = c, d = d, e = e))
# summary(fit_2)
# coef_fit_2 <- coef(fit_2)
# coef_fit_2
# Attach fitted values and residuals to the data (broom::augment).
income_2_augment <- augment(fit_2, income_2)
## plot predicted values against actual values
p1 <- income_2_augment |>
ggplot(aes(x = Income, y = .fitted)) +
geom_point(color = "red", shape = "circle") +
geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
labs(x = "Actual Income", y = "Predicted Income") +
scale_x_continuous(breaks = seq(0, 100, 10)) +
scale_y_continuous(breaks = seq(0, 100, 10)) +
theme_light()
p2 <- income_2_augment |>
ggplot(aes(x = Income, y = .resid)) +
geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
geom_point(color = "red", shape = "circle") +
labs(x = "Actual Income", y = "Residuals") +
scale_x_continuous(breaks = seq(0, 100, 10)) +
scale_y_continuous(limits = c(-15, 15), breaks = seq(-10, 10, 5)) +
theme_light()
p1 / p2 +
plot_layout(heights = c(3, 1), axes = "collect")\[ \textbf{Income} = \frac{a}{1 + \exp(-b(\textbf{Education} - c))} + d + e (\textbf{Seniority}) \]
# Predict income from the fitted two-predictor model `fit_2`
# (sigmoid in education plus a linear seniority term).
# Vectorized over education and seniority.
predict_income <- function(education, seniority) {
  # Rename local from `coef` to `cf` so it does not shadow stats::coef;
  # double quotes per the tidyverse style used elsewhere in this file.
  cf <- coef(fit_2)
  cf["d"] + cf["a"] / (1 + exp(-cf["b"] * (education - cf["c"]))) + cf["e"] * seniority
}
# Create sequences and grid
# 100 x 100 grid spanning the observed ranges of Education and Seniority.
education_seq <- seq(min(income_2$Education), max(income_2$Education), length.out = 100)
seniority_seq <- seq(min(income_2$Seniority), max(income_2$Seniority), length.out = 100)
grid <- expand.grid(Education = education_seq, Seniority = seniority_seq)
# Model-predicted income surface evaluated at every grid point.
grid$Income <- predict_income(grid$Education, grid$Seniority)
# 3. 3D scatter plot: predicted income surface (small points colored by
# Income) with the observed data overlaid in red.
# Native |> pipe used for consistency with the rest of this file.
p3 <- plotly::plot_ly(width = 500, height = 500) |>
  plotly::add_markers(data = grid, x = ~Education, y = ~Seniority, z = ~Income,
                      marker = list(size = 3, color = ~Income, colorscale = "Viridis"),
                      showlegend = FALSE) |>
  plotly::add_markers(data = income_2, x = ~Education, y = ~Seniority, z = ~Income,
                      marker = list(size = 5, color = "red", symbol = "circle"),
                      showlegend = FALSE) |>
  plotly::layout(scene = list(
    xaxis = list(title = "Education"),
    yaxis = list(title = "Seniority"),
    zaxis = list(title = "Income")
  ),
  title = "3D Income Scatter")
# 4. Static 3D surface plot (kept for reference; writes persp_plot.png when enabled)
# png("persp_plot.png", width = 800, height = 600)
# z_matrix <- matrix(grid$Income, nrow = length(education_seq), ncol = length(seniority_seq))
# persp(education_seq, seniority_seq, z_matrix,
# theta = 30, phi = 30, expand = 0.5, col = "lightblue",
# xlab = "Education", ylab = "Seniority", zlab = "Income",
# main = "3D Surface: Income vs Education and Seniority")
# points <- trans3d(income_2$Education, income_2$Seniority, income_2$Income,
# pmat = persp(education_seq, seniority_seq, z_matrix, theta = 30, phi = 30, expand = 0.5, col = "lightblue"))
# points(points, col = "red", pch = 16)
# dev.off()
# Display 3D scatter plot
# The static 3D surface plot is saved as "persp_plot.png"
p3

\[ MSE = \frac{1}{n} \sum_{i=1}^{n} \left (y_i - \hat{f}(x_i) \right )^2 \]
Where \(y_i\) is the true value of the response for observation \(i\), and \(\hat{f}(x_i)\) is the predicted value.
set.seed(142)
# NOTE(review): `n` and `poly_data` must be defined in an earlier chunk
# (presumably n = nrow(poly_data)); neither is created here — confirm.
# 80/20 train/test split by row index; seq_len(n) avoids the 1:n trap at n = 0.
train_index <- sample(seq_len(n), n * 0.8)
# train_data <- poly_data[train_index, ]
# test_data <- poly_data[-train_index, ]
# Label each row by split membership rather than physically splitting the data.
poly_data_sample <- poly_data |>
  mutate(
    sample = ifelse(row_number() %in% train_index, "Train", "Test")
  )
# poly_data_sample
# Scatter plot colored/shaped by split membership.
p1 <- poly_data_sample |>
  ggplot(aes(x = x, y = y)) +
  geom_point(aes(color = sample, shape = sample), size = 2) +
  scale_color_colorblind() +
  labs(x = "x", y = "y") +
  theme_light()
p1
Call:
lm(formula = y ~ poly(x, degree = 3), data = filter(poly_data_sample,
sample == "Train"))
Residuals:
Min 1Q Median 3Q Max
-3.9110 -1.3845 -0.0798 1.2008 8.4279
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.0068 0.2453 12.257 <2e-16 ***
poly(x, degree = 3)1 4.0329 2.1942 1.838 0.070 .
poly(x, degree = 3)2 32.5005 2.1942 14.812 <2e-16 ***
poly(x, degree = 3)3 2.5805 2.1942 1.176 0.243
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.194 on 76 degrees of freedom
Multiple R-squared: 0.7468, Adjusted R-squared: 0.7368
F-statistic: 74.72 on 3 and 76 DF, p-value: < 2.2e-16
# NOTE(review): poly_fit_train (the degree-3 fit whose summary is shown
# above) is created in a chunk not visible here — confirm it runs first.
poly_data_augment <- augment(poly_fit_train, filter(poly_data_sample, sample == "Train"))
# Training data with the fitted cubic polynomial overlaid.
p1 <- poly_data_augment |>
  ggplot(aes(x = x, y = y)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  # linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_line(aes(y = .fitted), color = "blue", linewidth = 1) +
  labs(x = "x", y = "y") +
  theme_light()
p1

# Print summary of the data
# print(summary(poly_data_sample))
# Check the proportion of train/test split
# print(table(poly_data_sample$sample) / nrow(poly_data_sample))
# Initialize vectors to store MSE values
mse_train <- numeric(10)
mse_test <- numeric(10)
# Fit raw polynomials of degree 1..10 on the training split and record
# the train and test mean squared error for each degree.
for (i in 1:10) {
  # Fit the model on training data
  poly_fit_train <- lm(y ~ poly(x, degree = i, raw = TRUE),
                       data = filter(poly_data_sample, sample == "Train"))
  # Calculate MSE for training data — computed from (y - .fitted)^2 so the
  # train and test MSE use the same formula (augment(..., newdata = ) is not
  # guaranteed to supply a .resid column).
  train_augment <- augment(poly_fit_train, newdata = filter(poly_data_sample, sample == "Train"))
  mse_train[i] <- mean((train_augment$y - train_augment$.fitted)^2)
  # Calculate MSE for test data
  test_augment <- augment(poly_fit_train, newdata = filter(poly_data_sample, sample == "Test"))
  mse_test[i] <- mean((test_augment$y - test_augment$.fitted)^2)
  # Print MSE for this degree
  # cat("Degree", i, "- Train MSE:", mse_train[i], "Test MSE:", mse_test[i], "\n")
}
# Create a data frame with results
mse_results <- data.frame(
  degree = 1:10,
  train_mse = mse_train,
  test_mse = mse_test
)
# Print results
# mse_results
# Plot MSE vs Polynomial Degree: training MSE falls monotonically with
# flexibility while test MSE is U-shaped (bias-variance trade-off).
ggplot(mse_results, aes(x = degree)) +
  geom_smooth(aes(y = train_mse, color = "Training"), se = FALSE, formula = y ~ x, method = "loess") +
  geom_smooth(aes(y = test_mse, color = "Test"), se = FALSE, formula = y ~ x, method = "loess") +
  geom_point(aes(y = train_mse, color = "Training")) +
  geom_point(aes(y = test_mse, color = "Test")) +
  scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
  labs(title = "MSE vs Polynomial Degree",
       x = "Polynomial Degree",
       y = "Mean Squared Error",
       color = "Dataset") +
  theme_light() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  scale_x_continuous(limits = c(1, 10), breaks = seq(0, 10, 1))

# Generate two interleaving half-moon clusters (an R port of
# sklearn.datasets.make_moons).
#
# n_samples: total number of points, split between the two moons.
# noise: standard deviation of Gaussian noise added to the coordinates.
# random_state: optional seed for reproducibility.
# Returns a list with X (n_samples x 2 coordinate matrix) and y
# (labels: 0 = outer moon, 1 = inner moon).
make_moons <- function(n_samples = 100, noise = 0.05, random_state = NULL) {
  if (!is.null(random_state)) {
    set.seed(random_state)
  }
  # Outer moon gets the extra point when n_samples is odd.
  n_samples_out <- ceiling(n_samples / 2)
  n_samples_in <- floor(n_samples / 2)
  # Outer moon: upper half of the unit circle.
  outer_circ_x <- cos(seq(0, pi, length.out = n_samples_out))
  outer_circ_y <- sin(seq(0, pi, length.out = n_samples_out))
  # Inner moon: reflected half circle, shifted right by 1 and down by 0.5.
  inner_circ_x <- 1 - cos(seq(0, pi, length.out = n_samples_in))
  inner_circ_y <- 1 - sin(seq(0, pi, length.out = n_samples_in)) - 0.5
  X <- rbind(
    cbind(outer_circ_x, outer_circ_y),
    cbind(inner_circ_x, inner_circ_y)
  )
  y <- c(rep(0, n_samples_out), rep(1, n_samples_in))
  if (noise > 0) {
    X <- X + matrix(rnorm(n_samples * 2, sd = noise), ncol = 2)
  }
  return(list(X = X, y = y))
}

# Generate moon-shaped data
set.seed(42) # For reproducibility
result <- make_moons(n_samples = 200, noise = 0.3, random_state = 42)
# Extract features and labels
X <- result$X
y <- result$y
# The column names outer_circ_x / outer_circ_y come from the cbind()
# inside make_moons() and survive data.frame().
df <- data.frame(X, Class = as.factor(y))
ggplot(df, aes(x = outer_circ_x, y = outer_circ_y)) +
  geom_point(aes(color = Class, shape = Class), size = 2) +
  labs(
    x = "X",
    y = "Y") +
  scale_color_colorblind() +
  theme_light()

Accuracy
# Split the data into training and testing sets (80/20 by row index);
# seq_len() is the safe replacement for 1:nrow(df).
train_index <- sample(seq_len(nrow(df)), nrow(df) * 0.8)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
# Fit the KNN model (k = 5): class::knn() returns predicted test-set labels.
knn_fit <- class::knn(train = train_data[, 1:2], test = test_data[, 1:2], cl = train_data$Class, k = 5)
# Calculate accuracy: proportion of correctly classified test points.
accuracy <- mean(knn_fit == test_data$Class)
accuracy

[1] 0.925
Confusion Matrix
\[ Y \approx \beta_0 + \beta_1 X \]
For the Advertising data, we might have:
\[ \textbf{Sales} \approx \beta_0 + \beta_1 \times \textbf{TV} \]
Where \(\beta_0\) is the intercept and \(\beta_1\) is the slope. These are unknown constants.
Once we estimate the coefficients \(\beta_0\) and \(\beta_1\), we can make predictions.
\[ \hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 X \]
# Simple linear regression of Sales on TV; broom::augment attaches
# .fitted and .resid columns used for the residual segments below.
advertising_fit <- lm(Sales ~ TV, data = advertising)
advertising_augment <- augment(advertising_fit, advertising)
advertising_augment |>
  ggplot(aes(TV, Sales)) +
  geom_segment(aes(xend = TV, yend = .fitted), color = "black", linetype = "solid", linewidth = 0.25) +
  geom_point(color = "red", shape = "circle") +
  # linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE, color = "blue", linewidth = 1) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()

\[ \text{RSS} = \sum_{i=1}^{n} \left (y_i - \hat{y}_i \right )^2 \]
The least squares approach chooses \(\hat{\beta}_0\) and \(\hat{\beta}_1\) to minimize the RSS.
\[
RSS = \sum_{i=1}^{n} \left (y_i - (\hat{\beta}_0 + \hat{\beta}_1 x_i) \right )^2
\] 1. Take the partial derivative of the RSS with respect to \(\beta_0\) and \(\beta_1\)
2. Set the derivatives to zero
3. Solve the resulting system of two equations with two unknowns, \(\beta_0\) and \(\beta_1\)
\[ \hat{\beta}_1 = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \]
\[ \hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x} \]
Call:
lm(formula = Sales ~ TV, data = advertising)
Residuals:
Min 1Q Median 3Q Max
-8.3860 -1.9545 -0.1913 2.0671 7.2124
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.032594 0.457843 15.36 <2e-16 ***
TV 0.047537 0.002691 17.67 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
# Create a grid of beta_0 and beta_1 values
beta_0 <- seq(5, 10, length.out = 100)
beta_1 <- seq(0.02, 0.08, length.out = 100)
grid <- expand.grid(beta_0 = beta_0, beta_1 = beta_1)
# Calculate RSS for each combination of beta_0 and beta_1.
# mapply() iterates over the columns directly; apply(grid, 1, ...) would
# first coerce the data frame to a matrix.
grid$RSS <- mapply(
  function(b0, b1) sum((advertising$Sales - (b0 + b1 * advertising$TV))^2),
  grid$beta_0,
  grid$beta_1
)
# Contour plot of RSS (in thousands) with the least-squares estimate in red.
ggplot(grid, aes(x = beta_0, y = beta_1, z = RSS/1000)) +
  geom_contour_filled(breaks = seq(2.1, 3, 0.05)) +
  geom_point(aes(x = coef(advertising_fit)[1], y = coef(advertising_fit)[2]), color = "red", size = 2) +
  labs(title = "RSS Contour Plot",
       x = expression(beta[0]),
       y = expression(beta[1]),
       z = "RSS") +
  theme_light()

\[ \text{Var}(\hat{\beta}_1) = \frac{\sigma^2}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \]
\[ \text{Var}(\hat{\beta}_0) = \sigma^2 \left [ \frac{1}{n} + \frac{\bar{x}^2}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \right ] \]
where \(\sigma^2\) is the variance of the error term \(\epsilon\).
# Reproduce the coefficient table of summary(advertising_fit) by hand.
# Standard errors are the square roots of the diagonal of the
# coefficient covariance matrix.
std_errors <- sqrt(diag(vcov(advertising_fit)))
estimates <- coef(advertising_fit)
# t statistic: estimate divided by its standard error.
t_statistics <- estimates / std_errors
# Two-sided p-value from the t distribution with n - 2 degrees of freedom.
p_values <- 2 * pt(abs(t_statistics), df = nrow(advertising) - 2, lower.tail = FALSE)
# Assemble the table in the same column order as summary() prints it.
results <- data.frame(
  term = names(estimates),
  estimate = estimates,
  std_error = std_errors,
  t_statistic = t_statistics,
  p_value = p_values
)
# Print the results
results
Call:
lm(formula = Sales ~ TV, data = advertising)
Residuals:
Min 1Q Median 3Q Max
-8.3860 -1.9545 -0.1913 2.0671 7.2124
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.032594 0.457843 15.36 <2e-16 ***
TV 0.047537 0.002691 17.67 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
# Calculate the confidence intervals for the coefficients from the stats package
conf_int <- confint(advertising_fit)
# Create a data frame with the results
results <- tibble(
  term = names(coef(advertising_fit)),
  estimate = coef(advertising_fit),
  lower = conf_int[, 1],
  upper = conf_int[, 2]
)
# Print the results
results

# Least-squares fit with its confidence band (se = TRUE).
advertising_augment |>
  ggplot(aes(x = TV, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(formula = y ~ x, method = "lm", se = TRUE, color = "blue", linewidth = 1) +
  labs(x = "TV", y = "Sales",
       title = "Scatter Plot of Advertising Data",
       subtitle = "se = TRUE") +
  scale_y_continuous(breaks = seq(0, 30, 5), limits = c(0, 30)) +
  theme_light()

# Same plot, with the (wider) prediction interval added as dashed lines.
advertising_augment |>
  ggplot(aes(x = TV, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(formula = y ~ x, method = "lm", se = TRUE, color = "blue", linewidth = 1) +
  geom_function(fun = function(x) predict(advertising_fit, newdata = data.frame(TV = x), interval = "predict")[, "lwr"],
                color = "black", linetype = "dashed") +
  geom_function(fun = function(x) predict(advertising_fit, newdata = data.frame(TV = x), interval = "predict")[, "upr"],
                color = "black", linetype = "dashed") +
  labs(x = "TV", y = "Sales",
       title = "Scatter Plot of Advertising Data",
       subtitle = "Demonstration of confidence and prediction intervals") +
  scale_y_continuous(breaks = seq(0, 30, 5), limits = c(0, 30)) +
  theme_light()

\[ R^2 = \frac{\text{Explained Variance}}{\text{Total Variance}} \]
\[ R^2 = \frac{TSS - RSS}{TSS} \quad \text{or} \quad R^2 = 1 - \frac{RSS}{TSS} \]
where TSS is the total sum of squares and RSS is the residual sum of squares.
\[
TSS = \sum_{i=1}^{n} (y_i - \bar{y})^2 \quad \text{and} \quad RSS = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2
\]
\(R^2\) is the proportion of the variance in the response variable that is predictable from the predictor variable and takes values between 0 and 1.
\[ \text{Adjusted } R^2 = 1 - \frac{RSS / (n - d - 1)}{TSS / (n - 1)} \] where \(n\) is the number of observations and \(d\) is the number of predictors in the model.
# Single-predictor regressions of Sales on each channel, side by side.
# Note: scale_color_viridis_c() has no effect here since no color
# aesthetic is mapped to data.
p1 <- advertising |>
  ggplot() +
  geom_point(aes(x = TV, y = Sales), shape = "circle open", color = "red", size = 2, stroke = 1) +
  geom_smooth(aes(x = TV, y = Sales), formula = y ~ x, method = "lm", se = FALSE, color = "blue", linewidth = 1) +
  labs(x = "TV", y = "Sales") +
  scale_color_viridis_c() +
  theme_light()
p2 <- advertising |>
  ggplot() +
  geom_point(aes(x = Radio, y = Sales), shape = "circle open", color = "red", size = 2, stroke = 1) +
  geom_smooth(aes(x = Radio, y = Sales), formula = y ~ x, method = "lm", se = FALSE, color = "blue", linewidth = 1) +
  labs(x = "Radio", y = "Sales") +
  scale_color_viridis_c() +
  theme_light()
p3 <- advertising |>
  ggplot() +
  geom_point(aes(x = Newspaper, y = Sales), shape = "circle open", color = "red", size = 2, stroke = 1) +
  geom_smooth(aes(x = Newspaper, y = Sales), formula = y ~ x, method = "lm", se = FALSE, color = "blue", linewidth = 1) +
  labs(x = "Newspaper", y = "Sales") +
  scale_color_viridis_c() +
  theme_light()
p1 + p2 + p3
Call:
lm(formula = Sales ~ TV + Radio + Newspaper, data = advertising)
Residuals:
Min 1Q Median 3Q Max
-8.8277 -0.8908 0.2418 1.1893 2.8292
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.938889 0.311908 9.422 <2e-16 ***
TV 0.045765 0.001395 32.809 <2e-16 ***
Radio 0.188530 0.008611 21.893 <2e-16 ***
Newspaper -0.001037 0.005871 -0.177 0.86
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# NOTE(review): advertising_all_fit (Sales ~ TV + Radio + Newspaper, per the
# summary shown above) is fitted in a chunk not visible here — confirm.
advertising_all_augment <- augment(advertising_all_fit, advertising)
# Predicted vs actual Sales for the full additive model.
p1 <- advertising_all_augment |>
  ggplot(aes(x = Sales, y = .fitted)) +
  geom_point(color = "red", shape = "circle") +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Sales", y = "Predicted Sales") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()
# Residuals against actual Sales.
p2 <- advertising_all_augment |>
  ggplot(aes(x = Sales, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Actual Sales", y = "Residuals") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(limits = c(-10, 10), breaks = seq(-10, 10, 5)) +
  theme_light()
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")

We can consider the interaction between features in the model. Mathematically, this would be equivalent to adding a new feature that is the product of two (or more) features.
For example, we can create a new column in the dataset that is the product of the TV and Radio columns, TV * Radio; TV and Newspaper, TV * Newspaper; and Radio and Newspaper, Radio * Newspaper.
We can also consider the product of all three columns, TV * Radio * Newspaper.
\[ \begin{multline} \text{Sales} \approx \beta_0 + \beta_1 \times \text{TV} + \beta_2 \times \text{Radio} + \beta_3 \times \text{Newspaper} \\+ \beta_4 \times \text{TV} \times \text{Radio} + \beta_5 \times \text{TV} \times \text{Newspaper} + \beta_6 \times \text{Radio} \times \text{Newspaper} \\+ \beta_7 \times \text{TV} \times \text{Radio} \times \text{Newspaper} \end{multline} \]
Call:
lm(formula = Sales ~ TV * Radio * Newspaper, data = advertising)
Residuals:
Min 1Q Median 3Q Max
-5.8955 -0.3883 0.1938 0.5865 1.5240
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.556e+00 4.655e-01 14.083 < 2e-16 ***
TV 1.971e-02 2.719e-03 7.250 9.95e-12 ***
Radio 1.962e-02 1.639e-02 1.197 0.233
Newspaper 1.311e-02 1.721e-02 0.761 0.447
TV:Radio 1.162e-03 9.753e-05 11.909 < 2e-16 ***
TV:Newspaper -5.545e-05 9.326e-05 -0.595 0.553
Radio:Newspaper 9.063e-06 4.831e-04 0.019 0.985
TV:Radio:Newspaper -7.610e-07 2.700e-06 -0.282 0.778
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9406 on 192 degrees of freedom
Multiple R-squared: 0.9686, Adjusted R-squared: 0.9675
F-statistic: 847.3 on 7 and 192 DF, p-value: < 2.2e-16
# NOTE(review): advertising_crossterms_fit (Sales ~ TV * Radio * Newspaper,
# per the summary shown above) is fitted in a chunk not visible here — confirm.
advertising_crossterms_augment <- augment(advertising_crossterms_fit, advertising)
# Predicted vs actual Sales for the model with all interaction terms.
p1 <- advertising_crossterms_augment |>
  ggplot(aes(x = Sales, y = .fitted)) +
  geom_point(color = "red", shape = "circle") +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Sales", y = "Predicted Sales") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()
# Residuals against actual Sales.
p2 <- advertising_crossterms_augment |>
  ggplot(aes(x = Sales, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Actual Sales", y = "Residuals") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(limits = c(-10, 10), breaks = seq(-10, 10, 5)) +
  theme_light()
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")

UCI Machine Learning Repository
Using the Concrete_Data.csv dataset, perform a linear regression analysis to predict the compressive strength of concrete using one or more of the other features in the dataset.
What was the best Adjusted \(R^2\) value you achieved?
# New names for columns
concrete_names <- c("Cement", "Blast_Furnace_Slag", "Fly_Ash", "Water", "Superplasticizer", "Coarse_Aggregate", "Fine_Aggregate", "Age", "Compressive_Strength")
# Load the dataset
# NOTE(review): the exercise text above mentions Concrete_Data.csv, but this
# chunk reads the Excel original (Concrete_Data.xls) — confirm which file is
# distributed with the course materials.
library(readxl)
# skip = 1 drops the original header row so concrete_names can replace it.
concrete_data <- read_xls("datasets/concrete+compressive+strength/Concrete_Data.xls",
col_names = concrete_names,
skip = 1)
concrete_data
Call:
lm(formula = Compressive_Strength ~ Cement + Superplasticizer,
data = concrete_data)
Residuals:
Min 1Q Median 3Q Max
-33.949 -10.032 -0.515 9.117 43.676
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.190242 1.250302 7.35 4.03e-13 ***
Cement 0.074794 0.004036 18.53 < 2e-16 ***
Superplasticizer 0.902460 0.070603 12.78 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.47 on 1027 degrees of freedom
Multiple R-squared: 0.3511, Adjusted R-squared: 0.3498
F-statistic: 277.8 on 2 and 1027 DF, p-value: < 2.2e-16
Call:
lm(formula = Compressive_Strength ~ ., data = concrete_data)
Residuals:
Min 1Q Median 3Q Max
-28.653 -6.303 0.704 6.562 34.446
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -23.163756 26.588421 -0.871 0.383851
Cement 0.119785 0.008489 14.110 < 2e-16 ***
Blast_Furnace_Slag 0.103847 0.010136 10.245 < 2e-16 ***
Fly_Ash 0.087943 0.012585 6.988 5.03e-12 ***
Water -0.150298 0.040179 -3.741 0.000194 ***
Superplasticizer 0.290687 0.093460 3.110 0.001921 **
Coarse_Aggregate 0.018030 0.009394 1.919 0.055227 .
Fine_Aggregate 0.020154 0.010703 1.883 0.059968 .
Age 0.114226 0.005427 21.046 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.4 on 1021 degrees of freedom
Multiple R-squared: 0.6155, Adjusted R-squared: 0.6125
F-statistic: 204.3 on 8 and 1021 DF, p-value: < 2.2e-16
# NOTE(review): concrete_fit_a / concrete_fit_b are the two models whose
# summaries are shown above (two features vs all features); their fitting
# code is in a chunk not visible here — confirm it runs first.
concrete_augment_a <- augment(concrete_fit_a, concrete_data)
concrete_augment_b <- augment(concrete_fit_b, concrete_data)
# Predicted vs actual compressive strength, two-feature model.
p1 <- concrete_augment_a |>
  ggplot(aes(x = Compressive_Strength, y = .fitted)) +
  geom_point(color = "red", shape = "circle", alpha = 1/5) +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Compressive Strength", y = "Predicted Compressive Strength") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(breaks = seq(0, 100, 10), limits = c(0, 80)) +
  labs(title = "Cement and Superplasticizer") +
  theme_light()
# Predicted vs actual compressive strength, all-features model.
p2 <- concrete_augment_b |>
  ggplot(aes(x = Compressive_Strength, y = .fitted)) +
  geom_point(color = "red", shape = "circle", alpha = 1/5) +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Compressive Strength", y = "Predicted Compressive Strength") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(breaks = seq(0, 100, 10), limits = c(0, 80)) +
  labs(title = "All Features") +
  theme_light()
p1 + p2 +
  plot_layout(axes = "collect")

Applied Statistical Techniques